# Record the objects that already exist so the cleanup step at the end of the
# script can remove everything created from here on.
keep <- ls()

# Read in persons, keeping rows whose stateserved is "IA" or "WI"
in_veterans_cwdb <- fread(paste0(cwdb_path, '/person_table.csv'))[
  stateserved %in% c("IA", "WI")
]

# Person id together with the enlistment date recorded on the person table
enlist_dates <- in_veterans_cwdb[, .(personpk, enlistdate)]

# Read in roster, restricted to the persons kept above
in_veterans_roster <- fread(paste0(cwdb_path, '/regiment_roster_table.csv'))[
  personpk %in% in_veterans_cwdb[["personpk"]]
]

# Keep first assignment: sort each person's roster rows by outdate and collapse
# them to a single row per person. Scalar fields are taken from the first row
# after sorting; the logical flags are TRUE when any of the person's rows
# carries the given in/out method.
setkeyv(in_veterans_roster, c("personpk", "outdate"))
in_veterans_roster <- in_veterans_roster[, .(
  regimentpk = regimentpk[1L],
  unitcompany = unitcompany[1L],
  indate = mustindate[1L],
  discharged = "Discharged" %in% outmethodsimple,
  disabled = "Disabled" %in% outmethodsimple,
  deserted = "Deserted" %in% outmethodsimple,
  drafted = "Drafted" %in% inmethodsimple,
  outmethodsimple = outmethodsimple[1L],
  outdate = outdate[1L]
), by = "personpk"]

# Get regiments that appear in the roster, reduced to the distinct
# (regimentpk, term_sort, type) combinations
reg <- fread(paste0(cwdb_path,'/regiments.csv'))[
  regimentpk %in% in_veterans_roster[["regimentpk"]]
]
reg <- unique(reg[, .(regimentpk, term_sort, type)])

# Merge in type of regiment (keyed join; every roster row is kept)
setkeyv(in_veterans_roster, "regimentpk")
setkeyv(reg, "regimentpk")

in_veterans_roster <- reg[in_veterans_roster]

# Merge in enlistdate (keyed join; every roster row is kept)
setkeyv(in_veterans_roster, "personpk")
setkeyv(enlist_dates, "personpk")

in_veterans_roster <- enlist_dates[in_veterans_roster]

# Unique persons, unique regiments
all_use_persons <- unique(in_veterans_cwdb[["personpk"]])
all_use_regiments <- in_veterans_roster[personpk %in% all_use_persons, unique(regimentpk)]


############################
#Get within-unit experience#
############################

# Outcome flags derived from the exit method and the person table
in_veterans_roster[, `:=`(
  died = outmethodsimple %in% c('Died of disease', 'Died POW', 'Killed/Died of wounds'),
  kia = outmethodsimple %in% c('Died POW', 'Killed/Died of wounds'),
  wounded = personpk %in% in_veterans_cwdb[waswounded == 'Y', personpk]
)]

# Parse the three date columns from "%Y-%m-%d" text into Date
in_veterans_roster[
  , c("indate", "enlistdate", "outdate") := lapply(
    .SD, function(v) as.Date(strptime(v, "%Y-%m-%d"))
  ),
  .SDcols = c("indate", "enlistdate", "outdate")
]

# Fill missing enlistdates: first with the muster-in date, then with the
# median enlistment date of the regiment
in_veterans_roster[is.na(enlistdate), enlistdate := indate]
in_veterans_roster[, median_edate := median(enlistdate, na.rm = TRUE), by = "regimentpk"]
in_veterans_roster[is.na(enlistdate), enlistdate := median_edate]
in_veterans_roster[, median_edate := NULL]

# Expected end of service: enlistment date plus (term_sort - 10000)
# NOTE(review): presumably term_sort encodes the term length in days offset
# by 10000 -- confirm against the regiments table documentation.
in_veterans_roster[, expected_out := enlistdate + (term_sort - 10000)]

# Make company: one id per (regimentpk, unitcompany) pair; rows with an empty
# unitcompany are left without a companypk
in_veterans_roster[unitcompany != "", companypk := .GRP, by = .(regimentpk, unitcompany)]

# Get usable sample
in_veterans_roster[, `:=`(
  useTime = !(is.na(enlistdate) | is.na(expected_out)),
  useCompany = !is.na(companypk)
)]


#Get company casualty rate
##########################
# Two identical copies of the usable sample: one is the query side of the
# overlap join, the other is keyed and acts as the lookup side.
a <- in_veterans_roster[(useTime) & (useCompany),
                        .(personpk, companypk, died, kia, disabled, enlistdate, expected_out)]
b <- copy(a)

setkeyv(b, c("companypk", "enlistdate", "expected_out"))

# Overlap soldiers serving in the same company at the same time, then count
# deaths/disabled among each soldier's company-mates. Rows pairing a soldier
# with himself are dropped (personpk != i.personpk).
company_casualty_rate <- foverlaps(
  na.omit(a), na.omit(b),
  by.x = c('companypk', 'enlistdate', 'expected_out'),
  by.y = c('companypk', 'enlistdate', 'expected_out')
)
company_casualty_rate <- company_casualty_rate[
  personpk != i.personpk,
  .(
    in_company_n = .N,
    company_deaths = sum(i.died),
    company_kia = sum(i.kia),
    company_disabled = sum(i.disabled)
  ),
  by = .(personpk, companypk)
]

rm(a, b)
gc()


#Get regimental casualty rate
#############################
# Overlap soldiers serving in the same regiment at the same time and count
# deaths/disabled among each soldier's regiment-mates.
# Requires a very large amount of RAM, so the section is switched off.

if (FALSE) {

  a <- in_veterans_roster[(useTime) & !is.na(regimentpk),
                          .(personpk, regimentpk, died, disabled, enlistdate, expected_out)]
  b <- copy(a)

  setkeyv(b, c("regimentpk", "enlistdate", "expected_out"))

  regiment_casualty_rate <- foverlaps(
    na.omit(a), na.omit(b),
    by.x = c('regimentpk',  'enlistdate', 'expected_out'),
    by.y = c('regimentpk',  'enlistdate', 'expected_out')
  )
  # Drop self-pairs before aggregating per soldier and regiment
  regiment_casualty_rate <- regiment_casualty_rate[
    personpk != i.personpk,
    .(
      in_regiment_n = .N,
      deaths = sum(i.died),
      disabled = sum(i.disabled)
    ),
    by = .(personpk, regimentpk)
  ]

  rm(a, b)
  gc()
}

################################################
#Get alternate regimental casualty / combat data
################################################

# Regimental casualty records for the regiments in the sample. Each record is
# turned into a one-day interval (enlistdate == expected_out == record date)
# so it can be overlapped with the soldiers' service intervals.
regcas <- fread(paste0(cwdb_path, '/regcas.csv'))[regimentpk %in% all_use_regiments]
regcas[, enlistdate := as.Date(strptime(date, "%Y-%m-%d"))]
regcas[, expected_out := enlistdate]


a <- in_veterans_roster[(useTime) & !is.na(regimentpk),
                        .(personpk, regimentpk, enlistdate, expected_out)]
b <- regcas[!is.na(enlistdate) & !is.na(regimentpk),
            .(regimentpk, enlistdate, expected_out, killed, wounded, pow, missing, battlenum)]

setkeyv(b, c("regimentpk", "enlistdate", "expected_out"))

# Get overlap of soldiers with regimental reported casualties, then summarise
# per soldier and regiment: distinct record dates, casualty totals and
# distinct battle numbers.
regiment_combat_data <- foverlaps(
  a, b,
  by.x = c('regimentpk', 'enlistdate', 'expected_out'),
  by.y = c('regimentpk', 'enlistdate', 'expected_out')
)
regiment_combat_data <- regiment_combat_data[, .(
  combat_days = length(unique(na.omit(enlistdate))),
  kia = sum(killed, na.rm = TRUE),
  wia = sum(wounded, na.rm = TRUE),
  pow = sum(pow, na.rm = TRUE),
  mia = sum(missing, na.rm = TRUE),
  battles = length(unique(na.omit(battlenum)))
), by = .(personpk, regimentpk)]

rm(a, b)
gc()

#######################
#Combat alongside USCT#
#######################

# Full regiment table and full (unfiltered) regimental casualty records
regiments <- fread(paste0(cwdb_path,'/regiments.csv'))
regcas <- fread(paste0(cwdb_path,'/regcas.csv'))

# Each casualty record becomes a one-day interval on its record date
regcas[, enlistdate := as.Date(strptime(date, "%Y-%m-%d"))]
regcas[, expected_out := enlistdate]

# USCT combat data: regiments whose state code is "UC".
# The USCT subset is taken from the table already in memory instead of parsing
# regcas.csv a second time; it must be taken before regcas is re-keyed below
# so the rows stay in file order.
usct_regiments <- regiments[state %in% "UC", regimentpk]
regcas_usct <- regcas[regimentpk %in% usct_regiments]

# Number of USCT casualty records per (date, place)
usct_combat <- regcas_usct[!is.na(enlistdate) & place != "",
                           list(enlistdate, expected_out, place, usct_combat = 1, regimentpk)]
usct_combat <- usct_combat[, list(usct_combat = sum(usct_combat)),
                           by = list(enlistdate, expected_out, place)]

setkey(regcas, enlistdate, expected_out, place)
setkey(usct_combat, enlistdate, expected_out, place)

# Attach the USCT count to every casualty record with the same date and place
regcas_usct_combat <- usct_combat[regcas, allow.cartesian = TRUE][!is.na(enlistdate) & place != ""]

# Get personal USCT combat experience
a <- in_veterans_roster[(useTime) & !is.na(regimentpk),
                        list(personpk, regimentpk, enlistdate, expected_out)]
b <- regcas_usct_combat[!is.na(enlistdate) & !is.na(regimentpk),
                        list(regimentpk, enlistdate, expected_out, usct_combat)]

setkey(b, regimentpk, enlistdate, expected_out)

# NOTE(review): usct_combat_days counts matched casualty records with a
# positive USCT count, not distinct dates; a regiment with several records on
# one date contributes more than once. Confirm this is intended.
regiment_usct_combat_data <- foverlaps(
  a, b,
  by.x = c('regimentpk', 'enlistdate', 'expected_out'),
  by.y = c('regimentpk', 'enlistdate', 'expected_out')
) %>%
  .[, list(usct_combat_regiments = sum(usct_combat, na.rm = TRUE),
           usct_combat_days = sum(usct_combat > 0, na.rm = TRUE)),
    by = list(personpk, regimentpk)]

rm(list = c('a', 'b', 'regcas_usct_combat', 'usct_combat', 'regcas', 'regcas_usct'))
gc()


############################
#Time in Mixed Race Brigade#
############################

# Load regimental assignments for the USCT regiments and the sample regiments
reg_assign <- fread(paste0(cwdb_path,'/assignus.csv'))[regimentpk %in% c(usct_regiments, all_use_regiments)]
reg_assign <- reg_assign[, list(regimentpk, fromdatel, fromyear, frommonth, todatel, toyear, tomonth,
                                assignnum, brigade, division, corps, army, armycode, armyname)]

# Build hierarchical unit labels by pasting the parent unit onto each level
reg_assign[, u_corps := paste(army, corps, sep = ":")]
reg_assign[, u_division := paste(u_corps, division, sep = ":")]
reg_assign[, u_brigade := paste(u_division, brigade, sep = ":")]
reg_assign[, u_brigade_alt := paste(u_corps, brigade, sep = ":")]

# Clean dates. Where a full date is missing it is imputed from the year and
# month fields and a flag column records which side was imputed. The explicit
# format makes as.Date() return NA for unparseable strings (e.g. "NA-NA-01")
# instead of raising an error.
reg_assign[, from_date := as.Date(strptime(fromdatel, format = '%Y-%m-%d'))]
reg_assign[, to_date := as.Date(strptime(todatel, format = '%Y-%m-%d'))]

# Only the end date is missing: use the day before the first of the to-month
reg_assign[!is.na(from_date) & is.na(to_date), to_date_flag := 1]
reg_assign[!is.na(from_date) & is.na(to_date),
           to_date := as.Date(paste(toyear, tomonth, '01', sep = '-'), format = '%Y-%m-%d') - 1]

# Only the start date is missing: use the first of the from-month
reg_assign[is.na(from_date) & !is.na(to_date), from_date_flag := 1]
reg_assign[is.na(from_date) & !is.na(to_date),
           from_date := as.Date(paste(fromyear, frommonth, '01', sep = '-'), format = '%Y-%m-%d')]

# Both dates are missing: impute both sides
from_to_idx <- is.na(reg_assign$from_date) & is.na(reg_assign$to_date)
reg_assign[from_to_idx, from_to_date_flag := 1]
reg_assign[from_to_idx,
           from_date := as.Date(paste(fromyear, frommonth, '01', sep = '-'), format = '%Y-%m-%d')]
reg_assign[from_to_idx,
           to_date := as.Date(paste(toyear, tomonth, '01', sep = '-'), format = '%Y-%m-%d') - 1]

# Fix to_date imputation when the assignment starts and ends in the same
# month: the imputed end date then precedes the start date, so move it to the
# last day of that month (first day of the following month minus one).
# December rolls over to January of the next year; pasting tomonth + 1
# directly would build the invalid month 13.
reg_assign[to_date < from_date &
             (!is.na(to_date_flag) | !is.na(from_to_date_flag)) &
             fromyear == toyear & frommonth == tomonth,
           to_date := as.Date(paste(toyear + (tomonth == 12), tomonth %% 12 + 1, '01', sep = '-'),
                              format = '%Y-%m-%d') - 1]


# Expand every assignment into one row per day.
# NOTE(review): the strict `from_date < to_date` drops assignments that start
# and end on the same day -- confirm this is intended.
reg_assign <- reg_assign[from_date < to_date, list(date = seq.Date(from_date, to_date, by = 'day'),
                                                   army,
                                                   u_corps,
                                                   u_division,
                                                   u_brigade,
                                                   to_date_flag,
                                                   from_date_flag,
                                                   from_to_date_flag,
                                                   from_date,
                                                   to_date), by = list(regimentpk, assignnum)]

# Brigade assignments: daily brigade of every non-USCT sample regiment, and
# the daily composition (all regiments / USCT regiments) of every brigade
unit_assignments <- reg_assign[regimentpk %in% setdiff(all_use_regiments, usct_regiments),
                               list(regimentpk, date, u_brigade)]
all_assignments <- reg_assign[, list(regimentpk_j = regimentpk, date, u_brigade)]
brigade_composition <- all_assignments[, list(regiments = length(regimentpk_j),
                                              usct_regiments = sum(regimentpk_j %in% usct_regiments)),
                                       by = list(u_brigade, date)]

setkey(unit_assignments, u_brigade, date)
setkey(brigade_composition, u_brigade, date)

# Each regiment-day becomes a one-day interval for the overlap join
brigade_match <- brigade_composition[unit_assignments, allow.cartesian = TRUE]
brigade_match[, enlistdate := as.Date(strptime(as.character(date), "%Y-%m-%d"))]
brigade_match[, expected_out := enlistdate]

# Get personal time in brigade assignments
a <- in_veterans_roster[(useTime) & !is.na(regimentpk),
                        list(personpk, regimentpk, enlistdate, expected_out)]
b <- brigade_match[!is.na(expected_out) & !is.na(regimentpk),
                   list(regimentpk, enlistdate, expected_out, usct_regiments)]

setkey(b, regimentpk, enlistdate, expected_out)

regiment_usct_brigade_data <- foverlaps(
  a, b,
  by.x = c('regimentpk', 'enlistdate', 'expected_out'),
  by.y = c('regimentpk', 'enlistdate', 'expected_out')
) %>%
  .[, list(usct_brigade_r = sum(usct_regiments, na.rm = TRUE),
           usct_brigade_any = sum(usct_regiments > 0, na.rm = TRUE),
           assign_days = .N,
           u_assign_days = unique(enlistdate) %>% length), by = list(personpk, regimentpk)]

rm(list = c('a', 'b', 'unit_assignments', 'brigade_composition', 'all_assignments',
            'brigade_match', 'reg_assign', 'from_to_idx'))
gc()




#################################
#Merge experience to roster data#
#################################

# Each experience table is attached with a keyed join of the form
# experience[roster], so every roster row is kept and unmatched rows get NA.

# Company casualty rates
setkeyv(in_veterans_roster, c("personpk", "companypk"))
setkeyv(company_casualty_rate, c("personpk", "companypk"))
in_veterans_roster <- company_casualty_rate[in_veterans_roster]

# Regiment combat experience
setkeyv(in_veterans_roster, c("personpk", "regimentpk"))
setkeyv(regiment_combat_data, c("personpk", "regimentpk"))

in_veterans_roster <- regiment_combat_data[in_veterans_roster]

# Regiment casualty rates (switched off together with the section that would
# build regiment_casualty_rate)
if (FALSE) {
  setkeyv(in_veterans_roster, c("personpk", "regimentpk"))
  setkeyv(regiment_casualty_rate, c("personpk", "regimentpk"))

  in_veterans_roster <- regiment_casualty_rate[in_veterans_roster]
}

# Regiment USCT combat experience
setkeyv(in_veterans_roster, c("personpk", "regimentpk"))
setkeyv(regiment_usct_combat_data, c("personpk", "regimentpk"))

in_veterans_roster <- regiment_usct_combat_data[in_veterans_roster]

# Regiment interracial brigades
setkeyv(in_veterans_roster, c("personpk", "regimentpk"))
setkeyv(regiment_usct_brigade_data, c("personpk", "regimentpk"))

in_veterans_roster <- regiment_usct_brigade_data[in_veterans_roster]

# The experience tables are no longer needed once merged
rm(company_casualty_rate,
   regiment_combat_data, regiment_usct_combat_data, regiment_usct_brigade_data)
gc()

# Center based on year of enlistment, state served, enlistment term, type of
# unit, method of muster

# Enlistment year, and the state looked up from the regiments table by
# regimentpk; only "IA" and "WI" rows are kept
in_veterans_roster[, `:=`(
  enlist_year = year(enlistdate),
  stateserved = regiments[["state"]][match(regimentpk, regiments[["regimentpk"]])]
)]
in_veterans_roster <- in_veterans_roster[stateserved %in% c("IA", "WI")]

# Company rates and the grouping labels used for centering
#in_veterans_roster[, regiment_casualty_rate := deaths / in_regiment_n ]
in_veterans_roster[, `:=`(
  company_casualty_rate = company_deaths / in_company_n,
  company_kia_rate = company_kia / in_company_n,
  reg_control = paste(stateserved, enlist_year, term_sort, type, drafted, sep = ":"),
  co_control = paste(enlist_year, regimentpk, sep = ":")
)]

co_cols <- c('company_casualty_rate', "company_kia_rate", "company_kia")
reg_cols <- c( "combat_days", "usct_combat_days", "battles", 'usct_brigade_any')

# Standardise every measure over the whole sample (subtract mean, divide by sd)
in_veterans_roster[
  , (co_cols) := lapply(.SD, function(v) (v - mean(v, na.rm = TRUE)) / sd(v, na.rm = TRUE)),
  .SDcols = co_cols
]
in_veterans_roster[
  , (reg_cols) := lapply(.SD, function(v) (v - mean(v, na.rm = TRUE)) / sd(v, na.rm = TRUE)),
  .SDcols = reg_cols
]

# Centered "c_" versions: subtract the mean within the control group
in_veterans_roster[
  , paste0("c_", co_cols) := lapply(.SD, function(v) v - mean(v, na.rm = TRUE)),
  by = co_control, .SDcols = co_cols
]
in_veterans_roster[
  , paste0("c_", reg_cols) := lapply(.SD, function(v) v - mean(v, na.rm = TRUE)),
  by = reg_control, .SDcols = reg_cols
]

# Group means of the standardised company measures
in_veterans_roster[, `:=`(
  m_company_casualty_rate = mean(company_casualty_rate, na.rm = TRUE),
  m_company_kia = mean(company_kia, na.rm = TRUE)
), by = co_control]


# Write the person id plus the centered measures
out <- in_veterans_roster[, .SD, .SDcols = c('personpk', paste0("c_", co_cols), paste0("c_", reg_cols))]
fwrite(out, './cleaned/cwdb_treatment_data_ia_wi.csv')

# Cleanup: remove everything created since `keep` was recorded
rm(list = setdiff(ls(), c(keep, 'keep')))
gc()